Appendix 



IMPROVED METHODS AND APPARATUS FOR FAST FOURIER TRANSFORM

The invention provides improved methods and apparatus for fast fourier transform. 

From the user's perspective, the code performs an in-place "split-complex" 1D FFT (forward or
inverse) for power-of-2 sizes ranging from 16 to 4096, inclusive.

There are 3 user-callable functions: fft_setup(), fft_z() and fft_free():

void fft_setup ( unsigned long LOG2N, FFT_setup *SETUP );

void fft_z ( float *Creal, float *Cimag, unsigned long LOG2N, FFT_setup *SETUP );

void fft_free ( FFT_setup *SETUP );

FFT_setup is a structure defined as follows: 

typedef struct { 
float *twidp; /* pointer to 16-byte aligned
                 malloc'ed twiddle buffer; allocated by
                 fft_setup() and released by fft_free() */

unsigned char *bitrp; /* pointer to static bit-reversal
                         table selected by fft_setup() */
}FFT_setup; 

A user first calls fft_setup() specifying a particular FFT size (actually, the base 2 log of the size)
along with a pointer to an uninitialized FFT_setup structure. This function allocates (malloc) and
builds the appropriate "twiddle" table and places a pointer to this table and the appropriate bit-
reversal table (a static table) in the FFT_setup structure supplied by the caller.

Next, fft_z() can be called repeatedly for the same size FFT as was specified in the
corresponding call to fft_setup(). The user must also specify the same FFT_setup structure that
was filled in by that call. The input/output vectors are supplied in a split-complex format with
the real parts contiguous in the first float vector argument (Creal) and the corresponding
imaginary parts contiguous in the second float vector argument (Cimag). The call performs a
forward FFT. To perform an inverse FFT, simply interchange the real and imaginary vectors
(i.e., specify the imaginary vector in the first argument and the real vector in the second
argument).

Finally, the user calls fft_free() to free the twiddle buffer previously allocated and constructed by
fft_setup(). The user must specify the same FFT_setup structure to both calls.

Here is a one line description of what is in each file: 

fft.h: user's header file

fft_bitr.c: contains static bit-reversal tables for all 9 FFT sizes (16 - 4096)
fft_setup.c: source for fft_setup() and fft_free()
fft_z.c: source for fft_z()

ppc_vmx.h: macro header file for VMX (altivec) emulation of SIMD instructions. 
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ppc_vmx.c: contains C functions that emulate VMX (altivec) SIMD instructions 

Note that fft_z() is implemented using macros that emulate VMX SIMD instructions. There is a
structure (VMX_reg) defined in ppc_vmx.h that emulates a 16-byte VMX SIMD register. The
floating point variables used in fft_z() are of this type. fft_z.c does *not* contain an optimized
PPC G4 implementation of fft_z() insofar as the instructions are *not* ordered in an optimal way
for that processor. However, the primary patent claim is clearly demonstrated in the final pass of
the FFT which begins on line 661 of fft_z.c. This section performs the final radix-4 in-place pass
of the FFT but manages to leave the results correctly ordered in the real and imaginary
input/output vectors. This can be accomplished with 32 or fewer 16-byte "registers" (i.e., 512 or
fewer bytes of temporary storage).

It will be appreciated that the teachings hereof may be applied using different programming 
languages, toolsets, operating systems, platforms and otherwise. 
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' File Name: fft_bitr.c 

Description: Special bit-reversed tables for FFT sizes
             4 <= LOG2N <= 12

Let: LOG2M = LOG2N - 4
     M = 2 ^ LOG2M

For each table:

section 1:

n1 = bitr[0] = # of elements in section 1

(The first and second elements are not in the table
as they are known to be 0 and M-1, respectively.)

0, M-1, bitr[1], ..., bitr[n1-2] =
indices that bit-reverse to themselves

section 2:

n2 = bitr[n1-1] = # of elements in section 2
It's always true that n1 + n2 = M.
(The first element is not in the table and, if
n2 != 0, is known to be 1.)

(1, bitr[n1]), (bitr[n1+1], bitr[n1+2]), ...,
(bitr[M-3], bitr[M-2]) = n2/2 pairs of indices that
bit-reverse to each other. bitr[M-1] = 0.

Mercury Computer Systems, Inc. 
Copyright (c) 1996 All rights reserved 

Revision Date Engineer; Reason 



0.0- 990716 jg; Created 



/*
 * Table for M = 1 (N = 16).
 *
 * With a single index (0) the general two-section layout degenerates:
 * the only meaningful entry is the leading count n1 = 1, which fft_z()
 * reads first and special-cases (it tests the first table entry for 1).
 * NOTE(review): the scanned listing lost the leading entry and shows
 * only "0, 0, 0"; the leading 1 and the zero padding are reconstructed
 * by analogy with the M = 2 table -- confirm against the original.
 */
unsigned char _fft_bitr_1[] = {
    1,
    0, 0, 0
};

/*
 * Table for M = 2 (N = 32).
 *
 * n1 = 2: both indices (0 and 1) bit-reverse to themselves and are
 * implicit, so section 1 has no explicit entries.  n2 = bitr[n1-1] = 0:
 * there are no swap pairs.  The remaining zeros are padding.
 */
unsigned char _fft_bitr_2[] = {
    2,
    0, 0, 0
};



/*
 * Table for M = 4 (N = 64).
 *
 * n1 = 2: indices 0 and 3 bit-reverse to themselves (both implicit).
 * n2 = 2: one swap pair (1, 2); the leading 1 is implicit.
 * Final entry bitr[M-1] is the 0 terminator.
 */
unsigned char _fft_bitr_4[] = {
    2,
    2, 2, 0
};



/*
 * Table for M = 8 (N = 128).
 *
 * Section 1 (n1 = 4): self-reversing indices 0, 7 (implicit), 2, 5.
 * Section 2 (n2 = 4): swap pairs (1, 4), (3, 6); the leading 1 is
 * implicit.  Final entry bitr[M-1] is the 0 terminator.
 */
unsigned char _fft_bitr_8[] = {
    4, 2, 5,
    4, 4, 3, 6, 0
};
/*
 * Table for M = 16 (N = 256).
 *
 * Section 1 (n1 = 4): self-reversing indices 0, 15 (implicit), 6, 9.
 * Section 2 (n2 = 12): swap pairs (1, 8), (2, 4), (3, 12), (5, 10),
 * (7, 14), (11, 13); the leading 1 is implicit.
 * Final entry bitr[M-1] is the 0 terminator.
 */
unsigned char _fft_bitr_16[] = {
    4, 6, 9,
    12, 8, 2, 4, 3, 12, 5, 10, 7, 14, 11, 13, 0
};
/*
 * Table for M = 32 (N = 512).
 *
 * Row 1: n1 = 8 followed by the explicit self-reversing indices
 *        (0 and 31 are implicit).
 * Rows 2-3: n2 = 24 followed by the swap pairs (i, bit-reverse of i);
 *        the leading 1 of the first pair is implicit.
 * Final entry bitr[M-1] is the 0 terminator.
 */
unsigned char _fft_bitr_32[] = {
    8, 4, 10, 14, 17, 21, 27,
    24, 16, 2, 8, 3, 24, 5, 20, 6, 12, 7, 28,
    9, 18, 11, 26, 13, 22, 15, 30, 19, 25, 23, 29, 0
};
/*
 * Table for M = 64 (N = 1024).
 *
 * Row 1: n1 = 8 followed by the explicit self-reversing indices
 *        (0 and 63 are implicit).
 * Remaining rows: n2 = 56 followed by the swap pairs
 *        (i, bit-reverse of i); the leading 1 of the first pair is
 *        implicit.  Final entry bitr[M-1] is the 0 terminator.
 */
unsigned char _fft_bitr_64[] = {
    8, 12, 18, 30, 33, 45, 51,
    56, 32, 2, 16, 3, 48, 4, 8, 5, 40, 6, 24,
    7, 56, 9, 36, 10, 20, 11, 52, 13, 44, 14, 28,
    15, 60, 17, 34, 19, 50, 21, 42, 22, 26, 23, 58,
    25, 38, 27, 54, 29, 46, 31, 62, 35, 49, 37, 41,
    39, 57, 43, 53, 47, 61, 55, 59, 0
};



/*
 * Table for M = 128 (N = 2048).
 *
 * Row 1: n1 = 16 followed by the explicit self-reversing indices
 *        (0 and 127 are implicit).
 * Remaining rows: n2 = 112 followed by the swap pairs
 *        (i, bit-reverse of i); the leading 1 of the first pair is
 *        implicit.  Final entry bitr[M-1] is the 0 terminator.
 */
unsigned char _fft_bitr_128[] = {
    16, 8, 20, 28, 34, 42, 54, 62, 65, 73, 85, 93, 99, 107, 119,
    112, 64, 2, 32, 3, 96, 4, 16, 5, 80, 6, 48, 7, 112, 9, 72,
    10, 40, 11, 104, 12, 24, 13, 88, 14, 56, 15, 120, 17, 68, 18, 36,
    19, 100, 21, 84, 22, 52, 23, 116, 25, 76, 26, 44, 27, 108, 29, 92,
    30, 60, 31, 124, 33, 66, 35, 98, 37, 82, 38, 50, 39, 114, 41, 74,
    43, 106, 45, 90, 46, 58, 47, 122, 49, 70, 51, 102, 53, 86, 55, 118,
    57, 78, 59, 110, 61, 94, 63, 126, 67, 97, 69, 81, 71, 113, 75, 105,
    77, 89, 79, 121, 83, 101, 87, 117, 91, 109, 95, 125, 103, 115, 111, 123, 0
};

/* 

* Table for M = 256 (N = 4096) . 
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*/ 

unsigned char _fft_bitr_256 [] = { 

16, 24, 36, 60, 66, 90, 102, 126, 129, 153, 165, 189, 195, 219, 231, 
240, 128, 2, 64, 3, 192, 4, 32, 5, 160, 6, 96, 7, 224, 8, 16, 
9, 144, 10, 80, 11, 208, 12, 48, 13, 176, 14, 112, 15, 240, 17, 136, 
18, 72, 19, 200, 20, 40, 21, 163, 22, 104, 23, 232, 25, 152, 26, 88, 
27, 216, 28, 56, 29, 184, 30, 120, 31, 248, 33, 132, 34, 68, 35, 196, 
37, 164, 38, 100, 39, 228, 41, 148, 42, 84, 43, 212, 44, 52, 45, 180, 
46, 116, 47, 244, 49, 140, 50, 76, 51, 204, 53, 172, 54, 108, 55, 236, 
57, 156, 58, 92, 59, 220, 61, 188, 62, 124, 63, 252, 65, 130, 67, 194 
69, 162, 70, 98, 71, 226, 73, 146, 74, 82, 75, 210, 77, 178, 78, 114,' 
79, 242, 81, 138, 83, 202, 85, 170, 86, 106, 87, 234, 89, 154, 91, 218, 
93, .186, 94, 122, 95, 250, 97, 134, 99, 198, 101, 166, 103, 230, 105, 150 

238 107 ' 214 ' 109 ' 182 ' 110 ' 118 ' U1 ' 246 ' 113 ' 142 ' 115 ' 206 ' 117 ' 174 ' 119 ' 

121, 158,. 123/' 222, 125., 190, 127, 254, '131, 193, 133, 161, 135, 225, 137," 
145, 

139, 209, 141, 177, 143, 241, 147, 201, 149, 169, 151, 233,' 155, 217, 157 
185, 

237 159 ' 249 ' 163 ' 197 ' 167 ' 229 ' 171 ' 213 ' 173 ' 181 ' 175 ' 245, 179, 205, 183, 

187, 221, 191, 253, 199, 227, 203, 211, 207, 243, 215, 235, 223, 251, 239, 



/ 
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File Name: fft_setup.c 

Description: Setup for fft_z (split complex in-place FFT)
Entry/params: void fft_setup ( ulong LOG2N,
                               FFT_setup *SETUP )
Entry/params: void fft_free ( FFT_setup *SETUP )

LOG2N is the log (base 2) of the FFT size.
(4 <= LOG2N <= 12)

Let: N = 2 ^ LOG2N
     LOG2M = LOG2N - 4
     M = 2 ^ LOG2M
     A = 2 * PI / N
     BITR( i, m ) = bit-reversal of unsigned integer i
                    over m bits

void fft_setup ( ulong LOG2N, FFT_setup *SETUP )

SETUP->twidp is set to an allocated buffer that is
16-byte aligned and contains M sets of 4 x 4 floating
point twiddles arranged exactly as follows:

cos(kA),  cos((k+1)A),  cos((k+2)A),  cos((k+3)A),
sin(kA),  sin((k+1)A),  sin((k+2)A),  sin((k+3)A),
cos(2kA), cos(2(k+1)A), cos(2(k+2)A), cos(2(k+3)A),
sin(2kA), sin(2(k+1)A), sin(2(k+2)A), sin(2(k+3)A)

for k = 0

cos(kA),  cos((k+1)A),  cos((k+2)A),  cos((k+3)A),
tan(kA),  tan((k+1)A),  tan((k+2)A),  tan((k+3)A),
cot(2kA), cot(2(k+1)A), cot(2(k+2)A), cot(2(k+3)A),
sin(2kA), sin(2(k+1)A), sin(2(k+2)A), sin(2(k+3)A)

for k = 4 * BITR( 1, LOG2M ),
        4 * BITR( 2, LOG2M ),
        ...
        4 * BITR( M-2, LOG2M )

cos(kA),  cos((k+1)A),  cos((k+2)A),  cos((k+3)A),
sin(kA),  sin((k+1)A),  sin((k+2)A),  sin((k+3)A),
cos(2kA), cos(2(k+1)A), cos(2(k+2)A), cos(2(k+3)A),
sin(2kA), sin(2(k+1)A), sin(2(k+2)A), sin(2(k+3)A)

for k = 4 * (M - 1)

SETUP->bitrp is set to static table of M unsigned char
bit-reversed index values (LOG2M bits) arranged
as follows:

section 1:

n1 = bitrp[0] = # of elements in section 1

(The first and second elements are not in the table 
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as they are known to be 0 and M-l, respectively.) 

0, M-1, bitrp[1], ..., bitrp[n1-2] =
indices that bit-reverse to themselves

section 2:

n2 = bitrp[n1-1] = # of elements in section 2
It's always true that n1 + n2 = M.
(The first element is not in the table and, if
n2 != 0, is known to be 1.)

(1, bitrp[n1]), (bitrp[n1+1], bitrp[n1+2]), ...,
(bitrp[M-3], bitrp[M-2]) = n2/2 pairs of indices that
bit-reverse to each other. bitrp[M-1] = 0.

void fft_free ( FFT_setup *SETUP )

frees SETUP->twidp and sets SETUP->twidp and
SETUP->bitrp to 0

Mercury Computer Systems, Inc. 
Copyright (c) 1999 All rights reserved 

Revision Date Engineer; Reason 

0.0 991119 jg; Created 



#include <malloc.h>
#include <math.h>
#include <stdlib.h>
#include "fft.h"
#include "ppc_vmx.h"

/* 2 * PI as a double; parenthesized so it is safe inside any expression. */
#define TWOPI ((double)6.2831853071795864769252868)
/*
 * BITR( log2x, index, bitr_index )
 *
 * Stores in bitr_index (an integer lvalue) the value of index with its
 * low log2x bits reversed.  Bits of index above log2x are ignored; a
 * log2x of 0 yields 0.  index and log2x are each evaluated as written
 * (log2x once per loop test), so pass side-effect-free expressions.
 *
 * The macro expands to a braced block, so it may be used as a
 * statement with or without a trailing semicolon.
 */
#define BITR( log2x, index, bitr_index )                        \
{                                                               \
    unsigned long _bitr_i, _bitr_x;                             \
    _bitr_x = (index);                                          \
    (bitr_index) = 0;                                           \
    for ( _bitr_i = 0; _bitr_i < (log2x); _bitr_i++ ) {         \
        (bitr_index) <<= 1;             /* make room for next bit */ \
        (bitr_index) |= (_bitr_x & 1);  /* append current low bit */ \
        _bitr_x >>= 1;                                          \
    }                                                           \
}

/*
 * Static bit-reversal tables, one per supported FFT size; the numeric
 * suffix is M = N / 16.
 */
extern uchar _fft_bitr_1[];
extern uchar _fft_bitr_2[];
extern uchar _fft_bitr_4[];
extern uchar _fft_bitr_8[];
extern uchar _fft_bitr_16[];
extern uchar _fft_bitr_32[];
extern uchar _fft_bitr_64[];
extern uchar _fft_bitr_128[];
extern uchar _fft_bitr_256[];

void fft_setup( ulong L0G2N, FFT_setup *SET0P ) 
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char **mallocp; 
char *buffer; 
float *twidp; 

ulong bitr_i, i, j, log2n_m4, n, nvl6; 

double angle, cosl, cos2, delta, incr, sinl, sin2, twopivn; 
n = 1 « L0G2N; 

buffer = malloc( (n * sizeof (float) ) +20 ); 
if ( ! buffer ) { 

SETUP->twidp = (float *)0; 

return; 

> . /• . ... 

twidp = (float *) ((ulong) (buffer + 20) & -15); 
mallocp = (char **) (twidp - 1); 
*mallocp = buffer; 

nvl6 = n » 4; 
log2n_m4 = L0G2N - 4; 
twopivn «= TWOPI / (double) n; 
delta = ( double) 0.0; 

for ( i = 0; i < nvl6; i++ ) { 
for ( j = 0; j < 4; j++ ) { 
incr = delta; 
angle = twopivn * incr;. . 
cosl = cos (angle); 
sinl = sin (angle); 
incr += delta; 
angle = twopivn * incr; 
. cos2 = cos (angle) ; 
sin2 = sin (angle); 

if ( ( i == 0 ) || ( i == (nvl6 - 1) ) ) { 
twidp [ (i « 4) + j] = (float) cosl; 
twidp [(i « 4) + j + 4] = ( float) sinl; 
twidp [(i « 4) + j + 8] = (float)cos2; 
twidp [(i « 4) + j + 12] = (float) sin2; 

} / 
else { 

BITR( log2n_m4, i, bitr_i ) 

twidp [ (bitr_i « 4) + j] = (float) cosl; 

twidp [ (bitr_i « 4) + j + 4] - (float) (sinl / cosl); 

twidp [ (bitr_i « 4) + j + 8] = (float) (cos2 / sin2); 

twidp [ (bitr_i « 4) + j + 12] = ( float )sin2; 

) 

delta += (double) 1.0; 

} 

} 

SETUP- >twidp - twidp; 
if ( L0G2N == 4 ) 

SETUP->bitrp = _fft_bitr 1; 
else if ' ( L0G2N == 5 ) 



SETUP->bitrp = 
else if ( L0G2N == 

SETUP->bitrp 
else if ( L0G2N 

SETUP->bitrp 
else if ( L0G2N 

SETUP->bitrp 
else if { L0G2N 

SETQP->bitrp 
else if ( L0G2N 

SETUP->bitrp 
else if ( L0G2N 

SETUP->bitrp 
else if ( L0G2N 

SETUP->bitrp 
return; 



fft_bitr_2; 
= 6 ) 
= _fft_bitr_4; 
== 7 ) 

= _fft_bitr_8; 
== 8 ) 

= _fft_bitr^l6; 
== 9 ) 

= _fft_bitr_32; 
== 10 ) 

= _fft_bitr_64; 
== 11 ) 

= _fft_bitr_128; 
== 12 ) 

= fft bitr 256; 



void fft_free( FFT_setup *SETUP 



char **mallocp; 

if ( (SETUP->bitrp ■ 
(SETUP->bitrp > 
(SETUP->bitrp - 
(SETUP~>bitrp ■ 
(SETUP->bitrp ■ 
(SETUP->bitrp = 
( SETUP- >bitrp ■ 
(SETUP->bitrp « 
(SETUP->bitrp ■ 
mallocp = (char • 
free ( *mallocp ) ; 



SETUP->twidp = (float *)0; 
SETOP->bitrp = (uchar *)0; 
return; 



= _fft_bitr_l) [ , 
= _fft_bitr_2) 
= _fft_bitr_4) 
= _fft_bitr_8) | I 
= _fft_bitr_16) 
= _f f t_bitr_32 ) 
= _fft_bitr_64) 
= _fft_bitr_128) 
= _fft_bitr_256) 
*) (SETUP->twidp - 



/ 
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File Name: fft_z.c 

Description: Forward (or Inverse) Complex In-place 1D FFT
Entry/params: void fft_z ( float *Cr, float *Ci,
                           ulong LOG2N, FFT_setup *SETUP )

Cr/Ci = 2 ^ LOG2N-point (4 <= LOG2N <= 12) forward in-place
        complex 1D FFT of the split complex vector stored
        in Cr and Ci.

        (Note, an inverse FFT can be performed by swapping
        Cr and Ci.)



Cr and Ci must be 16-byte aligned and have unit
stride between adjacent real (Cr) and imaginary (Ci)
points.

LOG2N is the log (base 2) of the FFT size.
(4 <= LOG2N <= 12)

Let: N = 2 ^ LOG2N
     LOG2M = LOG2N - 4
     M = 2 ^ LOG2M
     A = 2 * PI / N
     BITR( i, m ) = bit-reversal of unsigned integer i
                    over m bits

SETUP->twidp is a 16-byte aligned pointer to M sets
of 4 x 4 floating point twiddles arranged exactly
as follows:

cos(kA),  cos((k+1)A),  cos((k+2)A),  cos((k+3)A),
sin(kA),  sin((k+1)A),  sin((k+2)A),  sin((k+3)A),
cos(2kA), cos(2(k+1)A), cos(2(k+2)A), cos(2(k+3)A),
sin(2kA), sin(2(k+1)A), sin(2(k+2)A), sin(2(k+3)A)

for k = 0

cos(kA),  cos((k+1)A),  cos((k+2)A),  cos((k+3)A),
tan(kA),  tan((k+1)A),  tan((k+2)A),  tan((k+3)A),
cot(2kA), cot(2(k+1)A), cot(2(k+2)A), cot(2(k+3)A),
sin(2kA), sin(2(k+1)A), sin(2(k+2)A), sin(2(k+3)A)

for k = 4 * BITR( 1, LOG2M ),
        4 * BITR( 2, LOG2M ),
        ...
        4 * BITR( M-2, LOG2M )

cos(kA),  cos((k+1)A),  cos((k+2)A),  cos((k+3)A),
sin(kA),  sin((k+1)A),  sin((k+2)A),  sin((k+3)A),
cos(2kA), cos(2(k+1)A), cos(2(k+2)A), cos(2(k+3)A),
sin(2kA), sin(2(k+1)A), sin(2(k+2)A), sin(2(k+3)A)
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for k = 4 * (M - 1) 

SETUP->bitrp is a pointer to M unsigned char
bit-reversed index values (LOG2M bits) arranged
as follows:

section 1:

n1 = bitrp[0] = # of elements in section 1

(The first and second elements are not in the table
as they are known to be 0 and M-1, respectively.)

0, M-1, bitrp[1], ..., bitrp[n1-2] =
indices that bit-reverse to themselves

section 2:

n2 = bitrp[n1-1] = # of elements in section 2
It's always true that n1 + n2 = M.
(The first element is not in the table and, if
n2 != 0, is known to be 1.)

(1, bitrp[n1]), (bitrp[n1+1], bitrp[n1+2]), ...,
(bitrp[M-3], bitrp[M-2]) = n2/2 pairs of indices that
bit-reverse to each other. bitrp[M-1] = 0.

Mercury Computer Systems, Inc. 
Copyright (c) 1999 All rights reserved 



Revision Date 
0.0 991119 



Engineer; Reason 
jg; Created 



#include "fft.h" 
#include "ppc_vmx.h" 



void fft_z ( float *Cr, float *Ci, ulong LOG2N, FFT_setup *SETUP 

( /" 

float *Crl, *Cil, *Cr2, *Ci2, *Cr3, *Ci3; 

float *Cr4, *Ci4, *Cr5, *Ci5, *Cr6, *Ci6, *Cr7, *Ci7; 

float *wp0, *wpl, *wp2, *wp3; 

unsigned .char *bitrp; 

ulong index, index_bump, indexl, index2, windex; 
ulong bflycnt, bflyoff, gent, sent, N; 

VMX_reg aOr, aOi, air, ali, a2r, a2i, a3r, a3i 

VMX_reg yOr, yOi/ ylr, yli, y2r, y2i, y3r, y3i 

VMX_reg tlr, tli, t2r, t2i, m2r, m2i, m3r, m3i. 

VMX_reg pOr, pOi, plr, pli, p2r, p2i, p3r, p3i, 

VMX_reg xlr, xli, x2r, x2i; 

VMX_reg • cosl, sinl, cos2, sin2, tanl, cot2; 
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VMX reg 


aOr 8, 


aOi 8, 


air 8, 


ali 8, 


a2r_8 


a2i 8 


a3r 8 


a3i 8; 


VMX reg 


a4r_8, 


a4i 8, 


a5r 8, 


a5i 8, 


a6r_8 


a6i 8 


a7r 8 


a7i 8; 


VMX reg 


yOr_8 , 


yOi 8, 


ylr 8, 


yli 8, 


y2r_8 


y2i_8 


y3r 8 


y3i 8; 


VMX reg 


y4r 8, 


y4i 8, 


y5r 8, 


y5i 8, 


y6r 8 


y6i 8 


y7r 8 


y7i 8; 


VMX reg 


tlr 8, 


tli 8, 


t2r 8, 


t2i 8, 


t-3r 8 


t3i 8 


t4r 8 


t4i 8; 


VMX_reg 


t5r 8, 


t5i 8, 


t6r 8, 


t6i 8, 


t7r_8 


t7i_8 


t8r_8 


t8i 8; 


VMX reg 


dlr 8, 


dli 8, 


d2r 8, 


d2i 8, 


m2r 8 


m2i 8, 


m5r 8 


m5i 8; 


VMX_reg 


slr_8, 


sli_8, 


s2r_8, 


s2i 8, 


s3r 8 


s3i 8, 


s4r_8, 


s4i 8; 


VMX_reg 
/* 


em4 r_8 


em4i_ 


3, em7r_ 


_8, em7i_8, rad2v2; 







* here if N >= 16 
*/ 

wpO = SETUP->twidp; 
wpl = wpO .+ 4,y 
wp2 = wpO + 8; 
wp3 = wpO + 12; 
bitrp = SETUP->bitrp; 



if ( LOG2N & 1 ) { 



/* radix-8 first pass */ 



windex = 64; 

LVEWX( rad2v2, wpO, windex ) 

bflyoff = N » 1; 

VSPLTW( rad2v2, rad2v2, 0 ) 



/* cos (PI/4) = sqrt{2)/2 */ 
/* 4 * N/8 = N/2 byte offset */ 
/* replicate 4 times */ 



Crl - (float *) ((char *)Cr + bflyoff); 

Cil = (float *)((char *)Ci + bflyoff); 

Cr2 = (float *)((char *)Crl + bflyoff); 

Ci2 = (float *){(char *)Cil + bflyoff); 

Cr3 = (float *) ((char *)Cr2 + bflyoff); 

Ci3 = (float *) ((char *)Ci2 + bflyoff); 

Cr4 = (float *)((char *)Cr3 + bflyoff); 

Ci4 = (float *)((char *)Ci3 + bflyoff); 

Cr5 = (.float *)((char *)Cr4 + bflyoff); 

Ci5 = (float *)((char *)Ci4 + bflyoff); 

Cr6 = (float *) ((char *)Cr5 + bflyoff); 

Ci6 = (float *)((char *)Ci5 + bflyoff); 

Cr7 = /float *) {(char *)Cr6 + bflyoff); 

Ci7 =./(float *) ((char *)Ci6 + bflyoff); 



index = 0; 



bflycnt = bflyoff; 

while ( bflycnt ) { /* while ( index < bflyoff ) { */ 

LVX( a0r_8, Cr, index ) 

LVX( a0i_8, Ci, index ) 

LVX( alr_8, Crl, index ) 

LVX( ali_8, Cil, index ) 

LVX( a2r_8, Cr2, index ) 

LVX( a2i_8, Ci2, index ) 

LVX( a3r_8, Cr3, index ) 

LVX( a3i_8, Ci3, index ) 

LVX( a4r_8, Cr4, index ) 
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i 



LVX( a4i_8, Ci4, index ) 

LVX( a5r_8, Cr5, index ) 

LVX( a5i_8, Ci5,' index ) 

LVX( a6r_8, Cr6, index ) 

LVX( a6i_8, Ci6, index ) 

LVX( a7r_8, Cr7, index ) 

LVX( a7i_8, Ci7, index ) 

VADDFP ( tlr_8, a0r_8, a4r_8 

VSUBFP( dlr_8, a0r_8, a4r_8 

VADDFP ( tli_8, a0i_8, a4i_8 

VSDBFP( dli_8, a0i_8, a4i_8 

VADDFP { t3r_8, alr_8, a5r_8 
. VSUBFP( ,t4r_8, a5r_8, alr_8 
VADDFP '( t3i_8, ali_8, a5i_8 
VSUBFP( t4i_8, ali_8, a5i_8 

VADDFP ( t2r_8, a2r_8, a6r_8 

VSUBFP( d2r_8, a6r_8, a2r_8 

VADDFP { t2i_8, a2i_8, a6i_8 

VSUBFP( d2i_8, a2i_8, a6i_8 

VADDFP ( t5r_8, a3r_8, a7r_8 

VSUBFP( t6r_8, a7r_8, a3r_8 

VADDFP ( t5i_8, a3i_8, a7i_8 

VSUBFP( t6i_8, a3i_8, a7i_8 

VADDFP ( t7r_8, tlr_8, t2r^_8 

VSUBFP( m2r_8, tlr_8, t2r_8 

VADDFP ( t7i_8, tli_8, t2i_8 

VSUBFP( m2i_8, tli_8, t2i_8 

VADDFP ( t8r_8, t5r_8, t3r_8 

VADDFP ( t8i_8, t3i_8 f t5i_8 

VSUBFP( m5r_8, t3i_8, t5i_8 

VSCJBFP< m5i_8, t5r_8, t3r_8 

VADDFP ( y0r_8, t7r_8, t8r_8 
VADDFP ( y0i_8, t7i_8, t8i_8 
VADDFP ( y2r_8, m2r_8, m5r_8 
VADyDFP { y2i_8 , m2i_8, in5i_8 ; 

VSUBFP( y4r_8, t7r_8, t8r_8 ; 

VSUBFP( y4i_8, t7i_8, t8i_8 ] 

VSUBFP( y6r_8, ra2r_8, ra5r_8 ] 

VSUBFP{ y6i_8, m2i_8, m5i_8 ] 

VSUBFP( em4r_8, t6r_8, t4r_8 
VSUBFP( era4i_8, t4i_8, t6i_8 
VADDFP ( em7r_8, t4i_8, t6i_8 
VADDFP ( em7i_8, t6r_8, t4r_8 



VMADDFP ( slr_8, rad2v2, em4r_8, dlr_8 ) 
VMADDFP ( sli_8, rad2v2, em4i_8, dli_8 ) 
VNMSUBFP( s2r_8, rad2v2, em4r_8, dlr_8 ) 
VNMSUBFP( s2i 8, rad2v2, em4i_8, dli_8 ) 
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VMADDFP ( s3r_8, rad2v2, em7r_8, d2i_8 ) 
VMADDFP ( s3i_8, rad2v2, em7i_8, d2r_8 ) 
VNMSUBFP( s4r_8, rad2v2, em7r_8, d2i_8 ) 
VNMSUBFP( s4i_8, rad2v2, em7i_8, d2r_8 ) 



VADDFP ( ylr_8, slr_8, s3r_8 

VADDFP ( yli_8, sli_8, s3i_8 

VSDBFP{ y3r_8, s2r_8, s4r_8 

VSUBFP( y3i_8, s2i_8, s4i_8 



VADDFP ( y5r_8, s2r_8, s4r_8 

VADDFP ( y5i_8, s2i_8, s4i_8 

VSOBFP( y7r_8, slr_8, s3r_8 

VSUBFPJ ,y7i_8, sli 8, s3i_8 



STVX( yOr_8, 
STVX( yOi_8, 
STVX( y2r_8, 
STVX{ y2i_8, 
STVX( y4r_8, 
STVX( y4i_8, 
STVX( y6r_8, 
STVX( y6i_8, 
STVX( ylr_8, 
STVX( yli_8, 
STVX( y3r_8, 
STVX( y3i_8, 
STVX( y5r_8, 
STVX{ y5i_8, 
STVX( y7r_8, 
STVX( y7i_8, 



Cr, index ) 
Ci, index ) 

Cr2, index ) 

Ci2, index ) 

index ) 

index ) 

index ) 

index ) 

index ) 

index ) 

Cr6, index ) 

Ci6, index ) 

Cr5,. index ) 

Ci5, index ) 

Cr7, index ) 

Ci7, index ) 



/* bit-reverse output */ 



Crl 
Cil, 
Cr3, 
Ci3, 
Cr4, 
Ci4, 



index += 16; 
bflycnt -=16; 



bflyoff. = N; 

Crl -/(float 
Cil = (float 
Cr2 = (float 
Ci2 = (float 
Cr3 - (float 
Ci3 = (float 

index = 0; 



/* end radix-8 first pass */ 

/* radix-4 first pass */ 

/* 4 * N/4 = N byte offset */ 



) ( (char *)Cr + bflyoff); 
) ( (char *)Ci + bflyoff) ; 
) ((char *)Crl + bflyoff); 
) ((char *)Cil + bflyoff); 
) ( (char *)Cr2 + bflyoff); 
) ((char *)Ci2 + bflyoff); 



bflycnt = bflyoff; 

while ( bflycnt ) ( 

LVX( aOr, Cr, index ) 
LVX( aOi, Ci, index ) 
LVX( air, Crl, index ) 
LVX( ali, Cil, index ) 



/* while ( index < bflyoff ) { 
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LVX( a2r, Cr2, index } 

LVX( a2i, Ci2, index ) 

LVX( a3r f Cr3, index ) 

LVX( a3i, Ci3, index ) 



VADDFP ( tlr, aOr, a2r ) 

VADDFP ( tli, aOi, a2i ) 

VSDBFP( m2r, aOr, a2r ) 

VSUBFP( m2i, aOi, a2i ) 

VADDFP ( t2r, a3r, air ) 

VADDFP ( t2i, ali, a3i ) 

VSUBFP ( iti3r, ali, a3i ) 

VSUBFP( m3i, a3r, air ) 

VADDFP'( yOr, tlr,- t'2r ) 

VADDFP ( yOi, tli, t2i ) 

VADDFP ( ylr, m2r, m3r ) 

VADDFP ( yli, m2i, m3i ) 

VSUBFP( y2r, tlr, t2r ) 

VSUBFP( y2i, tli, t2i ) 

VSUBFP( y3r, m2r, m3r ) 

VSUBFP( y3i, m2i, m3i ) 

STVX( yOr, Cr, index ) 

STVX( yOi, Ci, index ) 

STVX( ylr, Cr2, index ) 

STVX( yli, Ci2, index ). 

STVX( y2r, Crl, index ) 

STVX( y2i, Cil, index ) 

STVX( y3r, Cr3, index ) 

STVX( y3i, Ci3, index ) 



/* bit-reverse output */ 



index += 16; 
bflycnt -= 16; 



while ( bflyoff > 64 ) {• 

index _bump = bflyoff; 
bflyoff »= 2; 
index_bump -= bflyoff; 



/* end radix-4 first pass */ 
/* middle stages */ 



/* decimate by 4 */ 
/* 3 * bflyoff */ 



Crl - (float *)((char *)Cr + bflyoff); 

Cil = (float *) ((char *)Ci + bflyoff); 

Cr2 = (float *)((char *)Crl + bflyoff); 

Ci2 = (float *)((char *)Cil + bflyoff); 

Cr3 = (float *)((char *)Cr2 + bflyoff); 

Ci3 = (float *)((char *)Ci2 + bflyoff); 



/* adjust pointers */ 



index > 



0; 



bflycnt = bflyoff; 
while ( bflycnt ) { 

LVX( aOr, Cr, index ) 



/* first (weightless) group ' 
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LVX( aOi, Ci, index ) 

LVX( air, Crl, index ) 

LVX( ali, Cil, index ) 

LVX( a2r, Cr2, index ) 

LVX( a2i, Ci2, index ) 

LVX( a3r, Cr3, index ) 

LVX( a3i, Ci3, index ) 



VADDFP ( tlr, aOr, a2r ) 

VADDFP ( tli, aOi, a2i ) 

VSUBFP( m2r, aOr, a2r ) 

VSUBFP( m2i, aOi, a2i ) 



VADDFP ( t2r, a3r, air ) 

VADDFP ( /t2i, ali-, a3i ) 

VSUBFP{ m3r, ali,- a3i ) 

VSUBFP( m3i, a3r, air ) 



VADDFP ( yOr, tlr, t2r ) 

VADDFP ( yOi, tli, t2i } 

VADDFP ( ylr, m2r, m3r ) 

VADDFP ( yli, m2i, m3i ) 



VSUBFP( y2r, tlr, t2r ) 

VSUBFP{ y2i, tli, t2i ) 

VSUBFP( y3r, m2r, m3r ) 

VSUBFP{ y3i, m2i, m3i ) 

STVX( yOr, Cr, index ) 
STVX( yOi, Ci, index ) 

STVX( ylr, Cr2, index ) 

STVX( yli, Ci2, index ) 

STVX( y2r, Crl, index ) 

STVX( y2i, Cil, index ) 

STVX( y3r, Cr3, index ) 

STVX( y3i, Ci3, index ) 

index +=16; 
bflycnt -= 16; 



/* bit-reverse output */ 



/* end of first (weightless) group */ 



windex/ = 64; 



gent = N - bflyoff; 
while ( gent ) ( 



/* loop for remaining groups */ 



/* 

* load weights for group 
*/ 

LVEWX( cosl, wpO, windex ) 
LVEWX( tanl, wpl, windex ) 
LVEWX( cot2, wp2, windex ) 
LVEWX( sin2, wp3, windex ) 

VSPLTW( cosl, cosl, 0 ) /* replicate 4 times */ 

VSPLTW( tanl, tanl, 0 ) 
VSPLTW( cot2, C0t2, 0 ) 
VSPLTW( sin2, sin2, 0 ) 
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index += index Jbump; 

bflycnt = bflyoff; 
while ( bflycnt ) { 

LVX( aOr, Gr, index ) 
LVX( aOi, Ci, index ) 
LVX( air, Crl, index ) 
LVX( ali, Cil, index ) 
LVX( a2r, Cr2, index ) 
LVX( a2i, Ci2 f index ) 
LVX( a3r, Cr3, index ) 
LVX( a3i, Ci3, index ) 
/ 

VMADDFP ( xlr / '-cot2, a2r, a2i ) 
VNMSUBFP( xli, cot2, a2i, a2r ) 
VMADDFP ( x2r, cot2, a3r, a3i ) 
VNMSUBFP( x2i, cot2, a3i, a3r ) 

VMADDFP ( tlr, sin2, xlr, aOr ) 
VNMSUBFPf tli, sin2, xli, aOi ) 
VMADDFP ( t2r, sin2, x2r, air ) 
VNMSUBFP( t2i, sin2, x2i, ali ) 

VNMSUBFP( m2r, sin2, xlr, aOr ) 
VMADDFP ( m2i, sin2, xli, aOi ) 
VNMSUBFP( m3r, sin2, x2r, air ) 
• . VMADDFP ( ra3i., sin2, x2i, all.) 

VMADDFP ( xlr, tanl, t2i, t2r ) 
VNMSUBFP( xli, tanl, t2r, t2i ) 
VNMSUBFP( x2r, tanl, m3r, m3i ) 
VMADDFP ( x2i, tanl, m3i, m3r ) 

VMADDFP ( yOr, cosl, xlr, tlr ) 
VMADDFP ( yOi, cosl, xli, tli ) 
VMADDFP ( ylr, cosl, x2r, m2r ) 
VNMSUBFP( yli, cosl, x2i, m2i ) 

VTNMSUBFP( y2r, cosl, xlr, tlr ) 
/TNMSUBFP( y2i, cosl, xli, tli ) 
• / VNMSUBFP( y3r, cosl, x2r, m2r ) 
VMADDFP ( y3i, cosl, x2i, m2i ) 



STVX( yOr, Cr, index ) 

STVX( yOi, Ci, index ) 

STVX( ylr, Cr2, index ) 

STVX( yli, Ci2, index ) 

STVX{ y2r, Crl, index ) 

STVX( y2i, Cil, index ) 

STVX( y3r, Cr3, index ) 

STVX( y3i, Ci3, index ) 



/* bit-reverse output */ 



index += 16; 
bflycnt -= 16; 
} ' /* end of butterfly loop 
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windex += 64; /* bump weight index */ 

gent -= bflyoff ; ... 
} ' /* end of group loop */ 

} /* end of stage loop */ 

if ( bflyoff == 64 ) { /* penultimate stage */ 

Crl - (float *)((char *)Cr + 16); /* adjust pointers * 

Cil = (float *)((char *)Ci + 16); 

Cr2 = (float *)((char *)Crl + 16); 

C±2. = (float *)((char *)Cil + 16); 

Cr3 = (float *). ((char *)Cr2 + 16); 

Ci3 = (float *)((char *)Ci2 + 16); 

index = 0; >' ■ /* same as windex */ 

/* 



first group (4 butterflies) is weightless 

7 

LVX( aOr, Cr, index ) 
LVX( aOi, Ci, index ) 
LVX( air, Crl, index ) 
LVX( ali, Cil, index ) 
LVX( a2r, Cr2, index ) 
LVX( a2i, Ci2, index ) 
LVX( a3r, Cr3, index ) 
LVX( a3i, Ci3, index ) 

VADDFP ( tlr, aOr, a2r ) • 
VADDFP { tli, aOi, a2i ) 
VSUBFP( m2r, aOr, a2r ) 
VSUBFP( m2i, aOi, a2i ) 

VADDFP ( t2r, a3r, air ) 

VADDFP ( t2i, ali, a3i ) 

VSUBFP( m3r, ali, a3i ) 

VSUBFP( m3i, a3r, air ) 

VADDFP ( yOr, tlr, t2r ) 

VADDFP ( yOi, tli, t2i ) 

VADDFP (-ylr, m2r, m3r ) 

VADDFBt yli, m2i, m3i ) 

VSUBFP( y2r, tlr, t2r ) 
VSUBFP{ y2i, tli, t2i ) 
VSUBFP( y3r, m2r, m3r ) 
VSUBFP( y3i, m2i, m3i ) 

STVX( yOr, Cr, index ) /* bit-reverse output */ 

STVX( yOi, Ci, index ) 

STVX( ylr, Cr2, index ) 

STVX( yli, Ci2, index ) 

STVX( y2r, Crl, index ) 

STVX( y2i, Cil, index ) 

STVX( y3r, Cr3, index ) 

STVX(. y3i, Ci3, index ) 
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/* 

* • loop for remaining butterflies except the very last 
*/ 

bflycnt = N - 32; 
while ( bflycnt ) { 

index += 64; • • 

/* 

* load weights for group 
*/ 

. LVEHX( cosl, wpO, index ) 
. LVEWX( tanl, wpl, index ) 
LVEWX( cot2, wp2, index ) 
■ LVEWX( sin2 t wp3, index ) . 

VSPLTW( cosl, cosl, 0 ) /* replicate 4 times */ 

VSPLTW( tanl, tanl, 0 ) 
VSPLTW( cot2, cot2, 0 ) 
VSPLTW{ sin2, sin2, 0 .) 

LVX( aOr, Cr, index ) 

LVX( aOi, Ci, index ) 

LVX{ air, Crl, index ) 

LVX( ali, Cil, index ) 

LVX( a2r, Cr2, index ) 

LVX( a2i, Ci2, index ) 

LVX( a3r, Cr3, index ) 

LVX{ a3i, Ci3, index ) 

VMADDFP ( xlr, cot2, a2r, a2i ) 
VNMSUBFP( xli, cot2, a2i, a2r ) 
VMADDFP ( x2r, cot2, a3r, a3i ) 
VNMSUBFP( x2i, cot2, a3i, a3r ) 

VMADDFP ( tlr, sin2, xlr, aOr ) 
VNMSDBFP ( tli, sin2, xli, aOi ) 
VMADDFP ( t2r, sin2, x2r, air ) 
VNMSDBFP ( t2i, sin2, x2i, ali ) 

VNMSDBFP ( m2r, sin2, xlr, aOr ) 
VMADDFP ( m2i, sin2, xli, aOi ) 
UBFP( m3r, sin2, x2r, air ) 
DFP( m3i, sin2, x2i, ali ) 

VMADDFP ( xlr, tanl, t2i, t2r ) 
VNMSUBFP( xli, tanl, t2r, t2i ) 
VNMSDBFP ( x2r, tanl, m3r, m3i ) 
VMADDFP ( x2i, tanl, m3i, m3r ) 

VMADDFP ( yOr, cosl, xlr, tlr ) 
VMADDFP ( yOi, cosl, xli, tli ) 
VMADDFP ( ylr, cosl, x2r, m2r } 
VNMSDBFP ( yli, cosl, x2i, m2i ) 

VNMSDBFP ( y2r, cosl, xlr, tlr ) 
VNMSDBFP ( y2i, cosl, xli, tli ) 
VNMSDBFP ( y3r, cosl, x2r, m2r ) 
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VMADDFP ( y3i, cosl, x2i, m2i ) 



STVX( yOr, Cr, index ) 
STVX( yOi, Ci, index ) 
STVX( ylr, Cr2, index ) 
STVX( yli, Ci2, index ) 
STVX( y2r, Crl, index ) 
STVX( y2i, Cil, index ) 
STVX( y3r, Cr3, index ) 
STVX( y3i, Ci3, index ) 

bflycnt -= 16; ■ 

} • 
/* 

* very last butterfly uses 
*/ 

index += 64; • • 

LVEWX( cosl, wpO, index ) 
LVEWX( sinl, wpl, index ) 
LVEWX( cos2, wp2, index ) 
LVEWX{ sin2, wp3, index ) 
VSPLTW( cosl, cosl, 0 ) 
VSPLTW( sinl, sinl, 0 ) 
VSPLTW( cos2, cos2, 0 ) 
VSPLTW( sin2, sin2, 0 ) 



/* bit-reverse output */ 



/* end of butterfly loop */ 
cosine/sine weights for accuracy 



/* replicate 4 times */ 



LVX( air, Crl, index ) 

LVX( ali, Cil, index ) 

LVX( a2r, Cr2, index ) 

LVX( a2i, Ci2, index ) 

LVX( a3r, Cr3, index ) 

LVX( a3i, Ci3, index ) 

LVX( aOr, Cr, index ) 

LVX( aOi, Ci, index ) 



VMADDFP { tlr, cos2, a2r, aOr ) 
VMADDFP S [ tli, cos2, a2i, aOi ) 
VNMSUBFP ( m2r, cos2, a2r, aOr ) 
VNMSUBFP ( m2i, cos2, a2i, aOi ) 

VMADDFP ( tlr, sin2, a2i, tlr ) 
VNMSUBFP ( tli, sin2, a2r, tli. ) 
VNMSUBFP( m2r, sin2, a2i, ro2r ) 
VMADDFP ( m2i, sin2, a2r, m2i ) 

VMADDFP ( t2r, cos2, a3r, air )' 
VMADDFP ( t2i, cos2, a3i, ali ) 
VNMSUBFP ( m3r, cos2, a3r, air ) 
VNMSUBFP ( m3i, cos2, a3i, ali ) 

VMADDFP { t2r, sin2, a3i, t2r ) 
VNMSUBFP ( t2i, sin2, a3r, t2i ) 
VNMSUBFP ( m3r, sin2, a3i, m3r ) 
VMADDFP ( m3i, sin2, a3r, m3i ) 
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VMADDFP ( yOr, cosl, t2r, tlr ) 
VMADDFP ( yOi, cosl, t2i, tli ) 
VNMSDBFP'( y2r, cosl, t2r, tlr ) 
VNMSUBFP( y2i, cosl, t2i, tli ) 

VMADDFP ( yOr, sinl, t2i, yOr ) 
VNMSUBFP( yOi, sinl, t2r, yOi ) 
VNMSUBFP( y2r, sinl, t2i, y2r ) 
VMADDFP ( y2i, sinl, t2r, y2i ) 

VNMSUBFP( ylr, sinl, m3r, m2r ) 
VNMStJBFP ( yli, sinl, m3±,..«2i ) 
VMADDFP ( y3r, sinl, m3r, m2r ) 
VMADDFP ( y3i, sinl, m3i, m2i ) 

/' 

VMADDFP { ylr, cosl, m3i, ylr }' 
VNMSUBFP( yli, cosl, m3r, yli ) 
VNMSDBFP ( y3r, cosl, m3i, y3r ) 
VMADDFP ( y3i, cosl, ra3r, y3i ) 



STVX( yOr, Cr, index ) 

STVX{ yOi, Ci, index ) 

STVX( ylr, Cr2, index ) 

STVX( yli, Ci2, index ) 

STVX( y2r, Crl, index ) 

STVX( y2i, Cil, index ) 

STVX( y3r, Cr3, index ) 

STVX( y3i, Ci3, index ) 

} 



/* 














final pass 






*/ 












Crl 




(float 


*) ( (char 


*)Cr 


+ N); 


Cil 




(float 


*) ((char 


*)Ci 


+ N); 


Cr2 




(float 


*) ((char 


*)Crl 


+ N) 


Ci2 




(float 


*) ((char 


*)Cil 


+ N) 


Cr3 




(float 


*) ((char 


*)Cr2 


+ N) 


Ci3 




(float 


*) {(char 


*)Ci2 


+ N) 



bflycnt = .(ulong) *bitrp; 
windex = /); 
index = 0; 



sent = (bflycnt == 1) ? 1 : 2; 
bflycnt -= sent; 

/* 

* loop for in-place butterflies 

*/ 

while ( sent ) { 



/* bit-reverse output */ 



/* end penultimate pass '*/ 



/* adjust pointers */ 



using cosine/sine weights (at most 2) 



LVX( aOr, Cr, index ) 

LVX( aOi, Ci, index ) 

LVX( air, Crl, index ) 

LVX( ali, Cil, index ) 

LVX( a2r, Cr2, index ) 
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LVX( a2i, Ci2, index ) 
LVX( a3r, Cr3, index ) 
LVX( a3i f Ci3, index ) 

LVX( cosl, wpO, windex ) 

LVX( sinl, wpl, windex ) 

LVX( cos2, wp2, windex ) ' 

LVX( sin2, wp3, windex ) 

/* 

* perform two (real and imaginary) 4x4 permutes 

* but swapping the resulting 2 middle columns 
*/ 

VMRGHW ( pOr, aOr, air ) 
VMRGHW ( pOir', aOi, ali ) 
VMRGHW ( plr, a2r, a3r ) 
VMRGHW { pli, a2i, a3i ) 

VMRGLW ( p2r, aOr, air ) 

VMRGLW ( p2i, aOi, ali ) 

VMRGLW ( p3r, a2r, a3r ) 

VMRGLW ( p3i, a2i, a3i ) 

VMRGHW ( aOr, pOr, plr ) 

VMRGHW ( aOi, pOi, pli ) 

VMRGLW ( air, pOr, plr ) 

VMRGLW ( ali, pOi, pli ) 

VMRGHW ( a2r, p2r, p3r ) 

VMRGHW ( a2i, p2i, P 3i ) 

VMRGLW ( a3r, p2r, p3r ) 

VMRGLW ( a3i, p2i, p3i ) 

VMADDFP ( tlr, cos2, a2r, aOr ) 
VMADDFP ( tli, cos2, a2i, aOi ) 
VNMSUBFP( m2r, cos2, a2r, aOr ) 
VNMSUBFP( m2i, cos2, a2i, aOi ) 

VMADDFP ( tlr, sin2, a2i, tlr } 
VNMSUBFP( tli, sin2, a2r, tli ) 
VNMSUBFP( m2r, sin2, a2i, m2r ) 
VMADDSP ( m2i, sin2, a2r, m2i ) 

VMADDFP ( t2r, cos2, a3r, air ) 
VMADDFP ( t2i, cos2, a3i, ali ) 
VNMSUBFP( m3r, cos2, a3r, air ) 
VNMSUBFP( m3i, cos2, a3i, ali ) 

VMADDFP ( t2r, sin2, a3i, t2r ) 
VNMSUBFP( t2i, sin2, a3r, t2i ) 
VNMSUBFP( m3r, sin2, a3i, m3r ) 
VMADDFP ( m3i, sin2, a3r, m3i ) 

VMADDFP ( yOr, cosl, t2r, tlr ) 
VMADDFP ( yOi, cosl, t2i, tli ) 
VNMSUBFP( y2r, cosl, t2r, tlr ) 
VNMSUBFP( y2i, cosl, t2i, tli ) 
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VMADDFP ( yOr, sinl, t2i, yOr ) 
VNMSUBFP( yOi, sinl, t2r, yOi ) 
VNMSUBFP( y2r, sinl, t2i, y2r ) 
VMADDFP ( y2i, sinl, t2r, y2i ) 

VNMSUBFP ( ylr, sinl, m3r, m2r ) 
VNMSUBFP ( yli, sinl, m3i, m2i ) 
VMADDFP ( y3r, sinl, m3r, m2r ) 
VMADDFP { y3i, sinl, m3i, m2i ) 

VMADDFP ( ylr, cosl, m3i, ylr ) 
VNMSUBFP ( yli, cbs'l, m3r, yli ) 
VNMSUBFP ( y3r, cosl, m3i, y3r ) 
VMADDFP ( y3,i, cosl, m3r, y3i ) 

STVX( yOr, Cr, index ) 

STVX( yOi, Ci, index ) 

STVX( ylr, Crl, index ) 

STVX( yli, Cil, index ) 

STVX( y2r, Cr2, index ) 

STVX( y2i, Ci2, index ) 

STVX{ y3r, Cr3, index ) 

STVX( y3i, Ci3, index ) 



/* no bit-reversal ! */ 



index = N - 16; 
windex = index « 2; 
sent -= 1; 



index = (ulong) *++bitrp; 
windex = index « 6; 
index «= 4; 



/* end butterfly loop */ 



* loop for remaining in-place butterflies (uses tan, cot weights 
*/ 

while { bflycnt ) { 



LVX( aOr,. Cr, index ) 
LVX ( aOi, Ci, index ) 



LVX( a/r, 

LVX( aU, 

LVX( a2r, 

LVX( a2i, 

LVX( a3r, 

LVX( a3i, 



Crl, index ) 

Cil, index ) 

Cr2, index ) 

Ci2, index ) 

Cr3, index j 

Ci3, index ) 



LVX( cosl, wpO, windex } 

LVX( tanl, wpl, windex ) 

LVX( cot2, wp2, windex ) 

LVX( sin2, wp3, windex ) 



perform two (real and imaginary) 4x4 permutes 
but swapping the resulting 2 middle columns 
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VMRGHW ( pOr, aOr, air ) 

VMRGHW ( pOi, aOi, ali ) 

VMRGHW ( plr, a2r, a3r ) 

VMRGHW ( pli, a2i, a3i ) 



VMRGLW ( p2r, 

VMRGLW ( pZi, 

VMRGLW ( p3r, 

VMRGLW ( p3i> 

VMRGHW ( aOr, 
VMRGHW ( aOi, 
VMRGLW ( air, 
VMRGLW ( ali, 

/ 

VMRGHW ( a2r, 
VMRGHW ( a2i, 
VMRGLW ( a3r, 
VMRGLW ( a3i, 



aOr, air ) 

aOi, ali ) 

a2r, a3r ) 

a2i, a3i ) 

pOr, plr ) 

pOi,. pli ) 

pOr, plr ) 

pOi, pli ) 

p2r, p3-r ) 

p2i, p3i ) 

p2r, p3r ) 

p2i, P 3i ) 



VMADDFP ( xlr, cot2, a2r, a2i ) 
VNMSUBFP( xli, cot2, a2i, a2r ) 
VMADDFP ( x2r, cot2, a3r, a3i ) 
VNMSUBFP( x2i, cot2, a3i, a3r ) 

VMADDFP ( tlr, sin2, xlr, aOr ) 
VNMSUBFP( tli, .sin2, xli, aOi ) 
VMADDFP ( t2r, sin2, x2r, air ) 
VNMSUBFP( t2i, sin2, x2i, ali ) 

VNMSUBFP( m2r, sin2, xlr, aOr ) 
VMADDFP ( m2i, sin2, xli, aOi ) 
VNMSUBFP( m3r, sin2, x2r, air ) 
VMADDFP ( m3i, sin2, x2i, ali ) 

VMADDFP ( xlr, tanl, t2i, t2r ) 
VNMSUBFP( xli, tanl, t2r, t2i ) 
VNMSUBFP( x2r, tanl, m3r, m3i ) 
VMADDFP ( x2i, tanl, m3i, m3r ) 

VMADDFP { yOr, cosl, xlr, tlr ) 
VMADDFP yOi, cosl, xli, tli ) 
VMADDFP ( ylr, cosl, x2r, m2r ) 
VNMSUBFP( yli, cosl, x2i, m2i ) 

VNMSUBFP( y2r, cosl, xlr, tlr ) 
VNMSUBFP( y2i, cosl, xli, tli ) 
VNMSUBFP( y3r, cosl, x2r, m2r ) 
VMADDFP ( y3i, cosl, x2i, ra2i ) 

STVX( yOr, Cr, index ) /* no bit-reversal 

STVX( yOi, Ci, index ) 

STVX( ylr, Crl, index ) 

STVX( yli, Cil, index ) 

STVX( y2r, Cr2, index ) 

STVX( y2i, Ci2, index ) 

STVXC y3r, Cr3, index ) 
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STVX( y3i, Ci3, index ) 

index = (ulong) *++bitrp; 
bflycnt -= 1; 
windex = index « 6; 
index «= 4; 

} .7* end butterfly loop */ 



/* 

* loop for out-of-place butterflies 
V 

bflycnt = index » 4; /* count of bit-reverse indices 

windex = 64; 

indexl = 16; 

while ( bflycnt ) { 

LVX( cosl, wpO, windex ) 

LVX( tanl, wpl, windex ) 

LVX( cot2, wp2, windex ) 

LVX( sin2, wp3, windex ) 



LVX( aOr, Cr, indexl ) 

LVX( aOi, Ci, indexl ) 

LVX( air, Crl, indexl ) 

LVX{ ali, Cil, indexl ) 

LVX( a2r, Cr2, indexl ) 

LVX( a2i, Ci2, indexl ) 

LVX( a3r, Cr3, indexl ) 

LVX( a3i, Ci3, indexl ) 



/* 

* perform two (real and imaginary) 4x4 permutes 

* but swapping the resulting 2 middle columns 
*/ 

VMRGHW( pOr, aOr, air ) 

VMRGHW ( pOi, aOi, ali ) 

VMRGHW ( plr, a2r, a3r ) 

VMRGHW (•. pli, a2i, a3i ) 



V.MRGLW( p2r, aOr, air ) 

VMRGLW (..p2i, aOi, ali ) 

VMRGLW/f p3r, a2r, a3r ) 

VMRGLW ( p3i, a2i, a3i ) 



VMRGHW ( aOr, pOr, plr ) 

VMRGHW ( aOi, pOi, pli ) 

VMRGLW ( air, pOr, plr ) 

VMRGLW ( ali, pOi, pli ) 



VMRGHW ( a2r, p2r, p3r ) 

VMRGHW ( a2i, p2i, p3i ) 

VMRGLW ( a3r, p2r, p3r ) 

VMRGLW ( a3i, p2i, p3i ) 



VMADDFP ( xlr, cot2, a2r, a2i ) 
VNMSUBFP{ xli, cot2, a2i, a2r ) 
VMADDFP ( x2r, cot2, a3r, a3i ) 
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VNMSUBFP ( x2i, cot2, a3i, a3r ) 



VMADDFP ( tlr, sin2, xlr, aOr ) 
VNMSUBFP( tli, sin2, xli, aOi ) 
VMADDFP ( t2r, sin2, x2r, air ) 
VNMSUBFP{ t2i, sin2, x2i, ali ) 

VNMSUBFP( m2r, sin2, xlr, aOr ) 
VMADDFP ( m2i, sin2, xli, aOi ) 
VNMSUBFP ( m3r, sin2, x2r,. air ) 
VMADDFP ( m3i, sin2, x2i, ali ) 

VMADDFP ( xlr, tanl, t2i, t2'r ) 
VNMSUBFP ( xli, tanl, t2r, t2i ) 
VNMSUBFP ( *2r, tanl, m3r, m3i } 
VMADDFP ( x2i, tanl, m3i, m3r ) 

VMADDFP ( yOr, cosl, xlr, tlr ) 
VMADDFP ( yOi, cosl, xli, tli ) 
VMADDFP ( ylr, cosl, x2r, m2r ) 
VNMSUBFP ( yli, cosl, x2i> m2i ) 

VNMSUBFP ( y2r, cosl, xlr, tlr ) 
VNMSUBFP ( y2i, cosl, xli, tli ) 
VNMSUBFP ( y3r, cosl, x2r, m2r ) 
VMADDFP ( y3i, cosl, x2i, m2i ) 

index2 = (ulong) *++bitrp; 
windex = index2 « 6; 
index2 «= 4; 

LVX( cosl, wpO, windex ) 

LVX( tanl, wpl, windex ) 

LVX( cot2, wp2, windex ) 

LVX( sin2, wp3, windex ) 

LVX( aOr, Cr, index2 ) 
LVX( aOi, Ci, index2 ) 
LVX( air, Crl, index2 ) 
LVX( ali, Cil, index2 ) 
LVX( a2r, Cr2, index2 ) 
LVX( a2i, Ci2, index2 ) 
LVX( d3r, Cr3, index2 ) 
LVX( a3i, Ci3, index2 ) 

STVX( yOr, Cr, index2 ) /* no bit-reversal ! */ 

STVX( yOi, Ci, index2 ) 

STVX( ylr, Crl, index2 ) 

STVX( yli, Cil, index2 ) 

STVX( y2r, Cr2, index2 ) 

STVX( y2i, Ci2, index2 ) 

STVX( y3r, Cr3, index2 ) 

STVX( y3i, Ci3, index2 ) 

/* 

* perform two (real and imaginary) 4x4 permutes 

* but swapping the resulting. 2 middle columns 
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*/ 

VMRGHW { pOr, .aOr, air ) 

VMRGHW ( pOi, aOi, ali ) 

VMRGHW ( plr, a2r, a3r ) 

VMRGHW ( pli, a2i, a3i ) 

VMRGLW ( p2r, aOr, air ) 

VMRGLW ( p2i, aOi, aii ) 

VMRGLW ( p3r, a2r, a3r ) 

VMRGLW ( P 3i, a2i, a3i ) 

VMRGHW ( aOr, pOr, plr ) ■ \ ■ 

VMRGHW ( aOi, pOi, pli ) 

VMRGLW ( air, pOr, plr ) 

VMRGLW ( a IV, pOi, pli ) 

VMRGHW ( a2r, p2r, p3r ) 

VMRGHW ( a2i, p2i, p3i )" 

VMRGLW ( a3r, p2r, p3r ) 

VMRGLW ( a3i, p2i, p3i ) 

VMADDFP ( xlr, cot2, a2r, a2i ) 
VNMSUBFP ( xli, cot2, a2i, a2r ) 
VMADDFP ( x2r, cot2, a3r, a3i ) 
VNMSUBFP ( x2i, cot2, a3i, a3r ) 

VMADDFP ( tlr, sin2, xlr, aOr ) 
VNMSUBFP( tli, sin2, xli, aOi ) 
VMADDFP ( t2r, sin2, x2r, air ) 
VNMSUBFP( t2i, sin2, x2i, ali ) 

VNMSUBFP( m2r, sin2, xlr, aOr ) 
VMADDFP ( m2i, sin2, xli, aOi ) 
VNMSUBFP( m3r, sin2, x2r, air ) 
VMADDFP ( m3i, sin2, x2i, ali ) 

VMADDFP { xlr, tanl, t2i, t2r ) 
VNMSUBFP ( xli, tanl, t2r, t2i ) 
VNMSUBFP ( x2r, tanl, m3r/ m3i ) 
VMADDFP ( x2i, tanl, ra3i, m3r ) 

VMADDFyP( yOr, cosl, xlr, tlr ) 
VMADDFP ( yOi, cosl, xli, tli ) 
VMADDFP ( ylr, cosl, x2r, m2r ) 
VNMSUBFP ( yli, cosl, x2i, m2i ) 

VNMSUBFP ( y2r, cosl, xlr, tlr ) 
VNMSUBFP ( y2i, cosl, xli, tli ) 
VNMSUBFP ( y3r, cosl, x2r, m2r ) 
VMADDFP ( y3i, cosl, x2i, m2i ) 

STVX( yOr, Cr, indexl ) /* no bit-reversal 

STVX( yOi, Ci, indexl ) 

STVX( ylr, Crl, indexl ) 

STVX( yli, Cil, indexl ) 

STVX( y2r, Cr2, indexl ) 

STVX( y2i, Ci2, indexl ) 
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STVX( y3r, Cr3, indexl ) 
STVX( y3i, Ci3, indexl ) 

indexl = (ulong) *++bitrp; 
windex = indexl « 6; 
indexl «= 4; 



bflycnt -= 2; 

} /* end butterfly loop 



/ 
/* Emulated condition register storage: eight elements, one per field (0-7). */
long CR[ 8 ];

/*
 * _lvewx - emulate the vmx "load vector element word" instruction.
 *
 *   vT      register image to update
 *   rA, rB  base and index; their sum is used as the address of the word
 *
 * Bits 2-3 of the address select which of the four ul[] slots of vT
 * receives the loaded value.  The other three slots are left unchanged.
 */
void _lvewx( VMX_reg *vT, ulong rA, ulong rB )
{
    ulong *addr;
    ulong i;

    addr = (ulong *)( rA + rB );
    i = ((ulong)addr & 0xc) >> 2;       /* word slot within the 16-byte line */
    vT->ul[i] = *addr;
}

/*
 * _lvx - emulate the vmx "load vector" instruction.
 *
 *   vT      register image that receives the four loaded elements
 *   rA, rB  base and index; their sum, with the low four bits cleared,
 *           is the address of the first element read
 *
 * The mask must be ~15 (clear bits 0-3).  A mask of -15 would leave bit 0
 * set and clear only bits 1-3, producing a misaligned address.
 */
void _lvx( VMX_reg *vT, ulong rA, ulong rB )
{
    ulong *addr;
    ulong i;

    addr = (ulong *)( (rA + rB) & ~(ulong)15 );
    for ( i = 0; i < 4; i++ )
        vT->ul[i] = addr[i];
}

/*
 * _stvewx - emulate the vmx "store vector element word" instruction.
 *
 *   vS      register image supplying the value
 *   rA, rB  base and index; their sum is the address that is written
 *
 * Bits 2-3 of the address select which of the four ul[] slots of vS is
 * stored; exactly one element is written to memory.
 */
void _stvewx( VMX_reg *vS, ulong rA, ulong rB )
{
    ulong *addr;
    ulong i;

    addr = (ulong *)( rA + rB );
    i = ((ulong)addr & 0xc) >> 2;       /* word slot within the 16-byte line */
    *addr = vS->ul[i];
}

void _stvx( VMX reg *vS, ulong rA, ulong rB ) 
{ 

ulong *addr; 
ulong i; 

addr = (ulong *)(((rA) + (rB) ) & -15); 
for ( i = 0; i < 4; i++ ) 
addrti] = (vS.) ->ul [i] ; 

} 

/*
 * _vaddfp - emulate the vmx floating-point add.
 *
 * For each of the four float elements: vT->f[i] = vA->f[i] + vB->f[i].
 */
void _vaddfp( VMX_reg *vT, VMX_reg *vA, VMX_reg *vB )
{
    ulong i;

    for ( i = 0; i < 4; i++ )
        vT->f[i] = vA->f[i] + vB->f[i];
}

/*
 * _vmaddfp - emulate the vmx floating-point multiply-add.
 *
 * For each of the four float elements:
 *     vT->f[i] = (vA->f[i] * vC->f[i]) + vB->f[i]
 *
 * Note the operand order: vC is the second multiplicand, vB the addend.
 */
void _vmaddfp( VMX_reg *vT, VMX_reg *vA, VMX_reg *vC, VMX_reg *vB )
{
    ulong i;

    for ( i = 0; i < 4; i++ )
        vT->f[i] = (vA->f[i] * vC->f[i]) + vB->f[i];
}

/*
 * _vmrghw - emulate the vmx "merge high word" instruction.
 *
 * Interleaves elements 0 and 1 of vA and vB:
 *     vT->ul = { vA->ul[0], vB->ul[0], vA->ul[1], vB->ul[1] }
 *
 * The result is built in a temporary first so that vT may be the same
 * register as vA or vB.
 */
void _vmrghw( VMX_reg *vT, VMX_reg *vA, VMX_reg *vB )
{
    VMX_reg v;
    ulong i, j;

    for ( i = 0; i < 2; i++ ) {
        j = i + i;
        v.ul[j]     = vA->ul[i];
        v.ul[j + 1] = vB->ul[i];
    }

    for ( i = 0; i < 4; i++ )
        vT->ul[i] = v.ul[i];
}

/*
 * _vmrglw - emulate the vmx "merge low word" instruction.
 *
 * Interleaves elements 2 and 3 of vA and vB:
 *     vT->ul = { vA->ul[2], vB->ul[2], vA->ul[3], vB->ul[3] }
 *
 * The result is built in a temporary first so that vT may be the same
 * register as vA or vB.
 */
void _vmrglw( VMX_reg *vT, VMX_reg *vA, VMX_reg *vB )
{
    VMX_reg v;
    ulong i, j;

    for ( i = 0; i < 2; i++ ) {
        j = i + i;
        v.ul[j]     = vA->ul[2 + i];
        v.ul[j + 1] = vB->ul[2 + i];
    }

    for ( i = 0; i < 4; i++ )
        vT->ul[i] = v.ul[i];
}

void ^vrasubfp( VMX_reg *vT, VMX_reg *vA, VMX_reg *vC, VMX_reg *vB ) 
{ 

ulong i; 

for ( i = 0; i < 4; i++ ) 

(vT)->f[i] = ((vA)->f[i] * (vC)->f[i]) - (vB)->f[i]; 

/*
 * _vnmsubfp - emulate the vmx negative multiply-subtract.
 *
 * For each of the four float elements:
 *     vT->f[i] = -((vA->f[i] * vC->f[i]) - vB->f[i])
 *
 * i.e. vB minus the product, which is how the butterflies use it to
 * form "accumulator - weight * value".
 */
void _vnmsubfp( VMX_reg *vT, VMX_reg *vA, VMX_reg *vC, VMX_reg *vB )
{
    ulong i;

    for ( i = 0; i < 4; i++ )
        vT->f[i] = -((vA->f[i] * vC->f[i]) - vB->f[i]);
}

/*
 * _vslw - emulate the vmx "shift left word" instruction.
 *
 * Each ul[] element of vA is shifted left by the low five bits of the
 * corresponding element of vB, and the result is stored in vT.
 */
void _vslw( VMX_reg *vT, VMX_reg *vA, VMX_reg *vB )
{
    ulong i, sh;

    for ( i = 0; i < 4; i++ ) {
        sh = vB->ul[i] & (ulong)0x1f;   /* shift count is taken modulo 32 */
        vT->ul[i] = vA->ul[i] << sh;
    }
}

/*
 * _vspltisw - splat a signed immediate into every word of a register.
 *
 * Stores SIMM into each of the four signed l[] elements of vT.
 */
void _vspltisw( VMX_reg *vT, long SIMM )
{
    const long fill = (long)SIMM;
    ulong lane = 4;

    /* walk the lanes from last to first; every lane gets the same value */
    while ( lane-- > 0 )
        vT->l[lane] = fill;
}

/*
 * _vspltw - emulate the vmx "splat word" instruction.
 *
 * Copies element (UIMM & 3) of vB into all four ul[] elements of vT.
 * The source element is read before any store, so vT may be the same
 * register as vB.
 */
void _vspltw( VMX_reg *vT, VMX_reg *vB, ulong UIMM )
{
    ulong i, ul;

    ul = vB->ul[UIMM & 0x3];
    for ( i = 0; i < 4; i++ )
        vT->ul[i] = ul;
}

/*
 * _vsubfp - emulate the vmx floating-point subtract.
 *
 * For each of the four float elements: vT->f[i] = vA->f[i] - vB->f[i].
 */
void _vsubfp( VMX_reg *vT, VMX_reg *vA, VMX_reg *vB )
{
    ulong i;

    for ( i = 0; i < 4; i++ )
        vT->f[i] = vA->f[i] - vB->f[i];
}

/*
 * _vxor - emulate the vmx bitwise exclusive-or.
 *
 * For each of the four ul[] elements: vT->ul[i] = vA->ul[i] ^ vB->ul[i].
 */
void _vxor( VMX_reg *vT, VMX_reg *vA, VMX_reg *vB )
{
    ulong i;

    for ( i = 0; i < 4; i++ )
        vT->ul[i] = vA->ul[i] ^ vB->ul[i];
}



Page 32 



/***********************************************
|* File Name:   ppc_vmx.h                                       *|
|* Description: Header file for PPC vmx (altivec) emulation     *|
|* Mercury Computer Systems, Inc.                               *|
|* Copyright (c) 1999 All rights reserved.                      *|
|* Revision     Date    Engineer; Reason                        *|
|*              991119  jg; Created                             *|
***********************************************/

/* shorthand names for the unsigned integer types used by the emulation */
#define uchar   unsigned char
#define ushort  unsigned short
#define ulong   unsigned long

/* 

* define a structure to represent a VMX (SIMD) register 
*/ 

typedef union { 



char 


c[16]; 


uchar 


uc[16] ; 


short 


s[8]; 


ushort 


us [8]; 


long 


1[4]; 


ulong 


ul[4]; 


float 


f[4]; 


VMX_reg 





/*
 * condition register comprised of 8 4-bit fields (0-7)
 */

extern long CR[];
/*
 * prototypes for functions that emulate vmx instructions
 *
 * Every function takes pointers to VMX_reg register images.  The load and
 * store functions additionally take a base (rA) and index (rB) whose sum
 * forms the memory address.
 */

void _lvewx( VMX_reg *vT, ulong rA, ulong rB );

void _lvx( VMX_reg *vT, ulong rA, ulong rB );

void _stvewx( VMX_reg *vS, ulong rA, ulong rB );

void _stvx( VMX_reg *vS, ulong rA, ulong rB );

void _vaddfp( VMX_reg *vT, VMX_reg *vA, VMX_reg *vB );

void _vmaddfp( VMX_reg *vT, VMX_reg *vA, VMX_reg *vC, VMX_reg *vB );

void _vmrghw( VMX_reg *vT, VMX_reg *vA, VMX_reg *vB );

void _vmrglw( VMX_reg *vT, VMX_reg *vA, VMX_reg *vB );

void _vmsubfp( VMX_reg *vT, VMX_reg *vA, VMX_reg *vC, VMX_reg *vB );

void _vnmsubfp( VMX_reg *vT, VMX_reg *vA, VMX_reg *vC, VMX_reg *vB );

void _vslw( VMX_reg *vT, VMX_reg *vA, VMX_reg *vB );

void _vspltw( VMX_reg *vT, VMX_reg *vB, ulong UIMM );

void _vspltisw( VMX_reg *vT, long SIMM );

void _vsubfp( VMX_reg *vT, VMX_reg *vA, VMX_reg *vB );

void _vxor( VMX_reg *vT, VMX_reg *vA, VMX_reg *vB );

/*
 * vmx instruction macros
 *
 * Each macro forwards to the matching emulation function, taking the
 * address of every register operand and casting address operands to ulong.
 * The expansions end in ';' because the FFT code invokes these macros as
 * stand-alone statements without a trailing semicolon.
 *
 * Arguments are parenthesized so that any lvalue/expression can be passed.
 */
#define LVEWX( vT, rA, rB )         _lvewx( &(vT), (ulong)(rA), (ulong)(rB) );
#define LVX( vT, rA, rB )           _lvx( &(vT), (ulong)(rA), (ulong)(rB) );
#define STVEWX( vS, rA, rB )        _stvewx( &(vS), (ulong)(rA), (ulong)(rB) );
#define STVX( vS, rA, rB )          _stvx( &(vS), (ulong)(rA), (ulong)(rB) );
#define VADDFP( vT, vA, vB )        _vaddfp( &(vT), &(vA), &(vB) );
#define VMADDFP( vT, vA, vC, vB )   _vmaddfp( &(vT), &(vA), &(vC), &(vB) );
#define VMRGHW( vT, vA, vB )        _vmrghw( &(vT), &(vA), &(vB) );
#define VMRGLW( vT, vA, vB )        _vmrglw( &(vT), &(vA), &(vB) );
#define VMSUBFP( vT, vA, vC, vB )   _vmsubfp( &(vT), &(vA), &(vC), &(vB) );
#define VNMSUBFP( vT, vA, vC, vB )  _vnmsubfp( &(vT), &(vA), &(vC), &(vB) );
#define VSLW( vT, vA, vB )          _vslw( &(vT), &(vA), &(vB) );
#define VSPLTW( vT, vB, UIMM )      _vspltw( &(vT), &(vB), (UIMM) );
#define VSPLTISW( vT, SIMM )        _vspltisw( &(vT), (SIMM) );
#define VSUBFP( vT, vA, vB )        _vsubfp( &(vT), &(vA), &(vB) );
#define VXOR( vT, vA, vB )          _vxor( &(vT), &(vA), &(vB) );
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